# 
# Getting started with R
# Vladan Devedzic, Oct 02, 2017
# 


# Download R from https://cran.r-project.org/ and install it.
# Download RStudio from https://www.rstudio.com/products/rstudio/download/ and install it.

# Open new project in RStudio (File > New Project...).
# Open new R script in the project (File > New File > R Script).


# Get working directory
# Show the directory R currently resolves relative file paths against.
getwd()

# Set working directory (Session > Set Working Directory)


# Help: open the documentation page for a function by its name.
help("getwd")

# Install packages



############################################
# Basic commands, operators and data types #
############################################

# Print a line of text in the console:
# print("Something")
print(x = "Hi :)")

# Naming and coding conventions: https://google.github.io/styleguide/Rguide.html

# Assignment statement: x <- <something>
a <- 4

# Manipulating objects in the workspace:
# ls()                      # list all objects in memory
# rm(<o1>, <o2>, <o3>, ...) # remove one or more objects from memory by their names
# rm(list = ls())           # remove all objects from memory (usually not recommended)
rm(list = "a")  # same effect as rm(a); the object to drop is given by name
ls()            # a is no longer listed

# Operators:
# +	  Add, 2 + 3 = 5
# -	  Subtract, 5 - 2 = 3
# *	  Multiply, 2 * 3 = 6
# /	  Divide, 6 / 2 = 3
# ^	  Exponent, 2 ^ 3 = 8
# %%	Modulus operator, 9%%2 = 1
# %/%	Integer division, 9 %/% 2 = 4
# <	  Less than
# >	  Greater than
# ==	Equal to
# <=	Less than or equal to
# >=	Greater than or equal to
# !=	Not equal to
# !	  Not
# |	  OR
# &	  And

# Expressions:
# <x> / <y> - <z>^2 ...
3 * 54 - 2^4

# Vectors:
# <y> <- c(<something1>, <something2>, <something3>, ...)
# <y> <- rep(<something>, <times>)
# <y> <- <int1>:<int2>
# <y> <- seq(<value1>, <value2>, by = <step>)
# length(<y>)
# The index of the first element in a vector is 1, not 0.
x <- c(2, 5, 6, 7, 12)
x[[1]]                     # first element of x
y <- rep(4, times = 123)   # the value 4 repeated 123 times
y <- seq.int(12L, 34L)     # overwrite y with the integers 12, 13, ..., 34
y
length(y)

# Data types: vector, factor, numeric, character, logical, data.frame, matrix, list, ...
# class(<something>)                      # data type
# mode(<something>), typeof(<something>)  # how a data item is internally stored in memory


# Factors:
# b <- c(1, 2, 2, 2, 3, 1, 1, 4, 5, 4)
# b.as.factor <- as.factor(b)
# levels(b.as.factor)
# f <- factor(c(1, 2, 3))
# Build a factor from a numeric vector, inspect its levels, then relabel them.
b <- c(1, 2, 2, 2, 3, 1, 1, 4, 5, 4)
b.f <- factor(b)
levels(b.f)  # "1" "2" "3" "4" "5"
# Replacement labels are matched to the existing levels by position.
levels(b.f) <- c("one", "two", "three", "four", "five")

# Dataframes:
# e.g., <dataframe> <- as.data.frame(<matrix>)
# str(<dataframe>)
# Small example dataframe: one row per actor, with the actor's name, sex and
# the value stored in movies.2016.
# Fix: "Charlize Therron" was misspelled; the name is used as an axis/legend
# label in the bar graph below, so the typo was visible in the output.
actor <- 
  data.frame(name = 
               c("Chris Abbott", 
                 "Charlize Theron", 
                 "Halle Berry", 
                 "Ryan Gosling"), 
             sex = c("Male", "Female", "Female", "Male"), 
             movies.2016 = c(2, 3, 2, 1))
# Before R 4.0, data.frame() converted strings to factors by default; force
# name to character so it behaves the same on every R version.
actor$name <- as.character(actor$name)

###########
# ggplot2 #
###########

# install.packages("ggplot2")
# library(ggplot2)
# Install ggplot2 only when it is not already available, so that re-running
# the script does not download and reinstall the package every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
# Attach the package so ggplot(), aes(), geom_bar() etc. can be called directly.
library(ggplot2)


# Data to plot:


# Bar graphs:
# ggplot(data = <dataframe>, 
#        aes(x = <column 1>, y = <column 2>, fill = <column 1>)) +  # fill = <column 1> is optional; no y for counts
#   geom_bar(stat = "identity") +                                   # "identity" for values, "count" for counts
#   xlab("<x-axis label>") + ylab("<y-axis label>") +
#   ggtitle("<graph title>")
# Bar graph of movies.2016 for each actor; bars are filled by actor name.
ggplot(data = actor,
       mapping = aes(x = name, y = movies.2016, fill = name)) +
  geom_bar(stat = "identity") +  # bar heights are taken directly from y
  labs(x = "Actor name", y = "Movies 2016", title = "Movies")



######################################
# Working with datasets / dataframes #
######################################

# Reading a dataset:
# <dataframe> <- read.csv("<filename>", stringsAsFactors = FALSE)
# str(<dataframe>)  # structure of <dataframe>, all variables/columns
# head(<dataframe>) # the first few rows
# tail(<dataframe>) # the last few rows
the.beatles.songs <- read.csv(file = "The Beatles songs dataset, v1.csv",
                              stringsAsFactors = FALSE)

# Examining a dataframe:
# str(<dataframe>)              # structure of <dataframe>, all variables/columns
# dim(<dataframe>)              # showing dimensions (numbers of rows and columns) of a dataframe
# names(<dataframe>)            # showing column names
# head(<dataframe>)             # the first few rows
# tail(<dataframe>)             # the last few rows
# <dataframe>[ , ]              # the entire dataframe
# <dataframe>                   # the entire dataframe
# <dataframe>[<m>, ]            # m-th row
# <dataframe>[ ,<n>]            # n-th column
# summary(<dataframe>$<column>) # summarizing a variable/column values
# fix(<dataframe>)              # editing a dataframe
# new.df <- edit(<dataframe>)   # editing a dataframe and assigning the modified dataframe to another dataframe
str(object = the.beatles.songs)
dim(x = the.beatles.songs)
summary(object = the.beatles.songs$Top.50.Billboard)

# Examining a dataframe visually, with ggplot():
# It is better to work with a "clean" dataset (no NAs, no duplicated values,...):
# <clean dataframe> <- read.csv("<filename>", stringsAsFactors = FALSE)
# Re-read the cleaned dataset. stringsAsFactors = FALSE keeps text columns as
# character on every R version (before R 4.0 the default converted them to
# factors), consistent with the earlier read.csv() call in this script.
the.beatles.songs <- read.csv("The Beatles songs dataset, v1, no NAs.csv",
                              stringsAsFactors = FALSE)

# For the sake of these examples, convert Year to factor, because
# write.csv/read.csv produces ints
the.beatles.songs$Year <- as.factor(the.beatles.songs$Year)

# Bargraph Duration(Year):
# ggplot(data = <dataframe>, 
#        aes(x = <column 1>, y = <column 2>, fill = <column 1>)) +  # fill = <column 1> is optional; no y for counts
#   geom_bar(stat = "identity") +                                   # "identity" for values, "count" for counts
#   xlab("<x-axis label>") + ylab("<y-axis label>") +
#   ggtitle("<graph title>")
# ggplot(data = <dataframe>, 
#        aes(x = <column 1>, fill = <column 1>)) +  # fill = <column 1> is optional; no y for counts
#   geom_bar(stat = "count") +                      # "count" for counts
#   xlab("<x-axis label>") + 
#   ylab("<y-axis label>") +
#   ggtitle("<graph title>")
# Bar graph of Duration by Year; bars are filled by Year.
ggplot(data = the.beatles.songs,
       mapping = aes(x = Year, y = Duration, fill = Year)) +
  geom_bar(stat = "identity") +  # bar heights are taken directly from y
  labs(x = "Year", y = "Duration", title = "Songs")
# Bar graph of the number of songs per Year: no y aesthetic is mapped, and
# stat = "count" makes each bar's height the number of rows for that Year.
# Fix: the y-axis was labelled "Duration" although the bars show counts.
ggplot(data = the.beatles.songs,
       aes(x = Year, fill = Year)) +
  geom_bar(stat = "count") +
  xlab("Year") + ylab("Number of songs") +
  ggtitle("Songs")


# Linegraph Duration(Year):
# ggplot(data = <dataframe>[<i>:<k>, ],             # plot line graph for a subset <i>:<k> of rows
#        aes(x = <column 1>, y = <column 2>, 
#            group = 1)) +                          # group = 1: connect all points
#   geom_line(color = "<color>", 
#             size = 2,                             # line thickness
#             linetype = "<linetype>") +            # http://www.cookbook-r.com/Graphs/Shapes_and_line_types/
#   geom_point(color = "<color>", 
#              shape = 25,                          # http://www.cookbook-r.com/Graphs/Shapes_and_line_types/
#              size = 8, fill = "<fill color>") + 
#   xlab("<x-axis label>") + ylab("<y-axis label>") +
#   ggtitle("<graph title>")


# Adding/Removing columns to/from a dataframe:
# <dataframe>$<new column name> <- <default value>  # adding a new column (default values)
# <dataframe>$<column name> <- NULL                 # removing a column


# Changing column names:
# colnames(<dataframe>)[i] <- "<new name>"
# NOTE(review): the column is addressed by position; presumably column 9 is
# Top.50.Billboard - confirm against the CSV before relying on the index.
names(the.beatles.songs)[9] <- "BB"

# Changing row names:
# rownames(<dataframe>)[i] <- "<new name>"
# rownames(<dataframe>) <- c("<new name 1>", "<new name 2>",...)
# rownames(<dataframe>) <- c(1, 2,...)
# rownames(<dataframe>) <- list("<new name 1>", <numeric 2>,...)
row.names(the.beatles.songs)[1] <- "song1"

# Saving a dataset (modified or newly created dataset):
# write.csv(x = <dataframe>, file = "<filename>", row.names = FALSE)  # do not include the row names (row numbers) column
# saveRDS(object = <dataframe or another R object>, file = "<filename>")  # save R object for the next session
# <dataframe or another R object> <- readRDS(file = "<filename>")         # restore R object in the next session
# Save the modified dataframe as a single R object for the next session
# (restore it with readRDS()).
# NOTE(review): the file is written in RDS format but carries an .RData
# extension, which is conventionally used for save()/load() files; consider
# renaming to .rds once any readers of this path are updated.
saveRDS(the.beatles.songs, file = "The Beatles songs, v2.RData")


#######################
# Working with tables #
#######################

# The table() function:
# table(<var>)  # typically a factor or an integer var
# Number of songs for each Year value.
table(the.beatles.songs[["Year"]])

# The prop.table() function:
# prop.table(table(<var>))
# round(prop.table(table(<var>)), digits = <n>)
# Same counts expressed as proportions of the total, raw and then rounded.
prop.table(x = table(the.beatles.songs[["Year"]]))
round(x = prop.table(x = table(the.beatles.songs[["Year"]])), digits = 3)

# Example: converting the.beatles.songs$Year to factor and showing it in tables



###################################
# Resources, readings, references #
###################################

# R Tutorials, http://www.endmemo.com/program/R/
# R: A Beginner's Guide (by Sharon Machlis), http://www.tfrec.wsu.edu/TFREConly/r4beginners_v3.pdf
# Graphs with ggplot2, http://www.cookbook-r.com/Graphs/
